* Title: 	census_acs_clean.do
* Version: 	23 May 2018
* Purpose: 	Clean data from IPUMS Census/ACS extracts and create new variables


*******************************************************************************
* (0) Start of file
*******************************************************************************

* Close any log left open by an earlier run (capture hides the error when none is open),
* then start a fresh log for this script
capture log close
log using "log/census_acs_clean", replace

* Reset the session: no screen pauses, no leftover macros, nothing in memory
set more off
macro drop _all
clear all

foreach dataset in census80 census90 census00 acs {

	*******************************************************************************
	* (1) Load data
	*******************************************************************************

	* Read only the variables used below from the raw extract for this dataset
	use year gq statefip puma conspuma cpuma               ///
		age sex marst hispan race                          ///
		age_sp* age_mom* age_pop*                          ///
		educ*                                              ///
		empstat* wkswork2 workedyr uhrswork ind1990        ///
		disabwrk vetdisab diff*                            ///
		inctot incwage                                     ///
		mig*                                               ///
		perwt                                              ///
		using "dta/raw/`dataset'_raw", clear

	if "`dataset'" == "acs" {
		* The ACS extract covers several survey years; keep only 2005-2016
		keep if inrange(year, 2005, 2016)
	}
	else {
		* Each census extract must contain its own census year and nothing else
		local census_year = cond("`dataset'" == "census80", 1980, cond("`dataset'" == "census90", 1990, 2000))
		assert year == `census_year'
	}

	
	*******************************************************************************
	* (2) Restrict sample
	*******************************************************************************

	* Sample is persons aged 16 or older with a recorded age
	drop if age < 16 | mi(age)

	* Remove armed forces (empstatd codes 13 through 15)
	drop if inrange(empstatd, 13, 15)

	* Remove institutional group quarters (gq code 3)
	keep if gq != 3
	
	
	*******************************************************************************
	* (3) Create data labels
	*******************************************************************************

	* Value labels that are attached to the variables built in later sections

	* 0/1 dummies
	label define yesno 0 "no" 1 "yes"

	* Marital status groups (MaritalI)
	label define maritallevels        ///
		1 "Never_Married"             ///
		2 "Married"                   ///
		3 "Separated_Divorced"        ///
		4 "Widowed"

	* Race / ethnicity groups (RaceI)
	label define racegroups           ///
		1 "White"                     ///
		2 "African-American"          ///
		3 "Hispanic"                  ///
		4 "Other"

	* Education groups (EducationI)
	label define edlevels             ///
		1 "Less_than_HS"              ///
		2 "High_School"               ///
		3 "Some_College"              ///
		4 "College_4yr_Grad"

	* Industry groups (IndustryI)
	label define indgroups            ///
		1  "PrimaryInd"               ///
		2  "Construction"             ///
		3  "NondurableMan"            ///
		4  "DurableMan"               ///
		5  "Transport"                ///
		6  "Wholesale"                ///
		7  "Retail"                   ///
		8  "Finance"                  ///
		9  "Business"                 ///
		10 "Personal"                 ///
		11 "Professional"             ///
		12 "Public"


	*******************************************************************************
	* (4) Create new variables (core/geography)
	*******************************************************************************

	* Attach the state-level region lookup. The assert option stops the run if any
	* person has a state that is absent from the lookup; lookup rows with no
	* persons are discarded by keep.
	merge m:1 statefip using "dta/state_region", ///
		assert(match using) keep(match) nogenerate

	
	*******************************************************************************
	* (5) Create new variables (demographics)
	*******************************************************************************

	* NOTE: variables are generated in the same order as before because the final
	* keep statement selects a positional range of variables.

	* Prime-age (25 to 54) dummies, one per sex code (1 for male, 2 for female)
	generate PrimeMaleD = (sex == 1 & inrange(age, 25, 54))
	label var PrimeMaleD "Dummy for Prime Male"
	notes PrimeMaleD: Dummy for prime male group

	generate PrimeFemaleD = (sex == 2 & inrange(age, 25, 54))
	label var PrimeFemaleD "Dummy for Prime Female"
	notes PrimeFemaleD: Dummy for prime female group

	* Marital status collapsed to four groups; any other marst value stays 0
	generate MaritalI = 0
	replace MaritalI = 1 if marst == 6
	replace MaritalI = 2 if inlist(marst, 1, 2)
	replace MaritalI = 3 if inlist(marst, 3, 4)
	replace MaritalI = 4 if marst == 5
	label var MaritalI "Indicator for marital status"
	notes MaritalI: Indicator variable for marital status
	label values MaritalI maritallevels

	* Hispanic status: hispan strictly between 0 and 9
	generate HispanicB = (hispan < 9 & hispan > 0)
	label var HispanicB "Dummy for Hispanic status"
	notes HispanicB: Dummy for Hispanic status

	* Race group: Hispanic takes precedence, then race codes 1 and 2 for
	* non-Hispanics, everything else falls into group 4
	generate RaceI = cond(HispanicB == 1, 3, cond(race == 1, 1, cond(race == 2, 2, 4)))
	label var RaceI "Indicator for race"
	notes RaceI: Reported race
	label values RaceI racegroups

	* Living with spouse: a spouse age strictly between 0 and 120 is recorded
	generate SpouseB = (age_sp < 120 & age_sp > 0)
	label var SpouseB "Dummy for living with spouse"
	notes SpouseB: Dummy for living with spouse

	* Living with a parent: any of the four parent-age variables is strictly
	* between 1 and 120
	generate ParentB = 0
	foreach parent_age of varlist age_mom age_mom2 age_pop age_pop2 {
		replace ParentB = 1 if `parent_age' > 1 & `parent_age' < 120
	}
	label var ParentB "Dummy for living parent"
	notes ParentB: Dummy for living parent

	* Living with both a spouse and a parent
	generate SpouseAndParentB = (ParentB == 1 & SpouseB == 1)
	label var SpouseAndParentB "Dummy for living with spouse and parent"
	notes SpouseAndParentB: Dummy for living with spouse and parent

	* All 0/1 dummies in this section share the yesno value label
	label values PrimeMaleD PrimeFemaleD HispanicB SpouseB ParentB SpouseAndParentB yesno


	*******************************************************************************
	* (6) Create new variables (education)
	*******************************************************************************

	* Education collapsed to the four edlevels groups; observations matching none
	* of the rules below stay 0.
	* Detailed code educd 61 is grouped with level 1 even though its general code
	* is 6 (presumably 12th grade without a diploma - confirm against the codebook).
	generate EducationI = 0
	replace EducationI = 1 if (educ >= 0 & educ < 6) | educd == 61
	replace EducationI = 2 if educ == 6 & educd != 61
	replace EducationI = 3 if inrange(educ, 7, 9)
	replace EducationI = 4 if inrange(educ, 10, 11)

	label var EducationI "Indicator for education"
	notes EducationI: Indicator variable for education levels
	label values EducationI edlevels


	*******************************************************************************
	* (7) Create new variables (labor force/disability)
	*******************************************************************************

	* Create dummy for non-employment: 1 when empstat is 2 or 3, 0 when empstat
	* is 1, missing for any other empstat value (section 10 asserts none remain)
	generate UnempB = .
	replace UnempB = 1 if (empstat == 2 | empstat ==3)
	replace UnempB = 0 if (empstat == 1)

	label var UnempB "Dummy for not employed"
	notes UnempB: Dummy for not employed 
	label values UnempB yesno

	* Create dummy for employment (complement of UnempB; missing where UnempB is)
	* NOTE(review): the note text below names ASEC_clean.do rather than this file;
	* left unchanged, confirm whether that is intentional.
	generate EmpB = 1-UnempB

	label var EmpB "Dummy for employed"
	notes EmpB: Dummy for employed \ ASEC_clean.do BA TS 
	label values EmpB yesno

	* Create dummy for work in last 12 months.
	* Stata treats a missing value as larger than any number, so an unguarded
	* wkswork2 > 0 would classify a missing wkswork2 as having worked. The
	* explicit missing check keeps unknown weeks from counting as work.
	* workedyr codes 1 and 2 force the dummy to 0 regardless of wkswork2.
	generate LastWorkB = ((wkswork2 > 0 & !mi(wkswork2)) | workedyr == 3)
	replace LastWorkB = 0 if (workedyr == 1 | workedyr == 2)

	label var LastWorkB "Dummy for worked in past 12 months"
	notes LastWorkB: Dummy for worked in past 12 months  
	label values LastWorkB yesno

	* Create dummy for long-term jobless (not employed, no work in past 12 months)
	generate LTJoblessB = (LastWorkB == 0 & UnempB == 1)

	label var  LTJoblessB "Dummy for >12 months jobless"
	notes  LTJoblessB: Dummy for jobless and no work in past 12 months  
	label values  LTJoblessB yesno

	* Industry group from ind1990, assigned to employed persons only; everyone
	* else keeps a missing IndustryI
	generate IndustryI = .
	replace IndustryI = 1  if EmpB == 1 & inrange(ind1990,  10,  50)
	replace IndustryI = 2  if EmpB == 1 & ind1990 == 60
	replace IndustryI = 3  if EmpB == 1 & inrange(ind1990, 100, 229)
	replace IndustryI = 4  if EmpB == 1 & inrange(ind1990, 230, 392)
	replace IndustryI = 5  if EmpB == 1 & inrange(ind1990, 400, 472)
	replace IndustryI = 6  if EmpB == 1 & inrange(ind1990, 500, 571)
	replace IndustryI = 7  if EmpB == 1 & inrange(ind1990, 580, 691)
	replace IndustryI = 8  if EmpB == 1 & inrange(ind1990, 700, 712)
	replace IndustryI = 9  if EmpB == 1 & inrange(ind1990, 721, 760)
	replace IndustryI = 10 if EmpB == 1 & inrange(ind1990, 761, 810)
	replace IndustryI = 11 if EmpB == 1 & inrange(ind1990, 812, 893)
	replace IndustryI = 12 if EmpB == 1 & inrange(ind1990, 900, 932)

	* The ranges above have gaps, so verify that every employed person was classified
	assert !mi(IndustryI) if EmpB==1

	label var IndustryI "Indicator for industry"
	notes IndustryI: Indicator variable for industry
	label values IndustryI indgroups

	* Disability dummy from self reports: flagged when disabwrk is 2 to 4,
	* vetdisab is 3 to 9, or any of the difficulty items equals 2
	generate DisabilityB = 0
	replace DisabilityB = 1 if inrange(disabwrk, 2, 4) | inrange(vetdisab, 3, 9)
	foreach difficulty of varlist diffrem diffphys diffmob diffcare diffsens diffeye diffhear {
		replace DisabilityB = 1 if `difficulty' == 2
	}

	label var DisabilityB "Self reported disability dummy"
	notes DisabilityB: Dummy for disability based on self reports
	label values DisabilityB yesno

	*******************************************************************************
	* (8) Create new variables (income)
	*******************************************************************************
	
	* Merge with CPI factor to get real incomes. Every year in the data must have
	* a CPI factor (assert); factor rows for unused years are dropped (keep).
	merge m:1 year using dta/cpi_factor, assert(matched using) keep(matched) nogen
	
	* Create variables for real total and wage income (excluding N/A values), all individuals,
	* all employed, and full-time/full-year/positive only
	foreach var of varlist inctot incwage {
		
		* Blank out the N/A and missing sentinel codes before deflating.
		* 999998 and 999999 are the codes the file has always recoded; 9999999 is
		* added because total income is presumably a 7-digit field whose N/A code
		* is 9999999 (per the IPUMS USA codebook - confirm against the extract).
		replace `var' = . if inlist(`var', 999998, 999999, 9999999)
		
		* Real income for everyone, and for the employed only
		gen double `var'_real			= `var' * CPI_Factor  	
		gen double `var'_real_emp		= `var' * CPI_Factor if (EmpB == 1)		

		* Full-time, full-year, positive income only. Missing values compare
		* greater than any number in Stata, so usual hours must be checked for
		* missing explicitly or unknown hours would pass the 35-hour cutoff.
		gen double `var'_real_ftfy_pos 	= `var' * CPI_Factor ///
			if (EmpB == 1 & wkswork2 == 6 & uhrswork >= 35 & !mi(uhrswork) & `var' > 0 & !mi(`var'))

	}
		

	*******************************************************************************
	* (9) Create new variables (migration)
	*******************************************************************************
	
	* The census files use migrate5 (5-year window) and the ACS uses migrate1
	* (1-year window). In both, codes 1 and 2 count as no interstate or
	* international move, 3 as an interstate move and 4 as an international move.
	* Any other code leaves the dummies missing.

	if inlist("`dataset'", "census80", "census90", "census00") {
	
		* Create dummy for interstate migration (5 yr)
		generate InterstateB = .
		replace InterstateB = 0 if inlist(migrate5, 1, 2, 4)
		replace InterstateB = 1 if migrate5 == 3

		label var InterstateB "Moved interstate 5yrs"
		notes InterstateB: Dummy for moving interstate 
		label values InterstateB yesno	

		* Create dummy for international migration (5 yr)
		generate InternationalB = .
		replace InternationalB = 0 if inlist(migrate5, 1, 2, 3)
		replace InternationalB = 1 if migrate5 == 4

		label var InternationalB "Moved International 5yrs"
		notes InternationalB: Dummy for moving internationally
		label values InternationalB yesno	

		* Create dummy for interstate / international migration (5 yr)
		generate MigrationB = .
		replace MigrationB = 0 if inlist(migrate5, 1, 2)
		replace MigrationB = 1 if inlist(migrate5, 3, 4)

		label var MigrationB "Moved 5yrs interstate/international"
		notes MigrationB: Dummy for moving from 5 years ago 
		label values MigrationB yesno
		
	}
	
	if "`dataset'" == "acs" {
	
		* Create dummy for interstate migration (1 yr)
		generate InterstateB = .
		replace InterstateB = 0 if inlist(migrate1, 1, 2, 4)
		replace InterstateB = 1 if migrate1 == 3

		label var InterstateB "Moved interstate 1yr"
		notes InterstateB: Dummy for moving interstate 
		label values InterstateB yesno	

		* Create dummy for international migration (1 yr)
		generate InternationalB = .
		replace InternationalB = 0 if inlist(migrate1, 1, 2, 3)
		replace InternationalB = 1 if migrate1 == 4

		label var InternationalB "Moved international 1yr"
		notes InternationalB: Dummy for moving internationally
		label values InternationalB yesno	

		* Create dummy for interstate / international migration (1 yr)
		generate MigrationB = .
		replace MigrationB = 0 if inlist(migrate1, 1, 2)
		replace MigrationB = 1 if inlist(migrate1, 3, 4)

		label var MigrationB "Moved 1yr interstate / international"
		notes MigrationB: Dummy for moving from 1 years ago 
		label values MigrationB yesno	
		
	}


	*******************************************************************************
	* (10) Save data
	*******************************************************************************

	* Every remaining person must have a classified employment status
	assert !mi(UnempB)

	* Keep identifiers, weight, age, the raw migration variables, and the
	* contiguous run of variables from RegionI through MigrationB.
	* NOTE(review): the range is positional, so it relies on RegionI being added
	* by the region merge before any created variable and on MigrationB being the
	* last variable generated - confirm if sections are reordered.
	keep year statefip puma conspuma cpuma0010 perwt age mig* RegionI-MigrationB
	compress
	
	label data "Create `dataset' data \ 05-23-2018"
	notes: `dataset'_clean ///
		   \ census_acs_clean.do \ BA TS 

	* reset replaces any signature that came along with the loaded data instead
	* of stopping with an error, matching the combined-file step later on
	datasignature set, reset
	save dta/`dataset'_clean, replace
}

*******************************************************************************
* (11) Create file with 1980/1990/2000 Census and 2006-2008, 2009-2011 ACS
*******************************************************************************

* Start from the cleaned ACS file and keep the two three-year windows
use dta/acs_clean.dta, clear
keep if (year>=2006 & year<=2008) | (year>=2009 & year<=2011)

* Stack each cleaned census file and then delete that intermediate file.
* Stata's own erase command is used instead of shelling out: it behaves the
* same on every operating system, whereas the Windows shell erase may read the
* forward slash in the path as a command switch.
foreach dataset in census80 census90 census00 { 
	append using dta/`dataset'_clean.dta
	erase "dta/`dataset'_clean.dta"
}

label data "Create Census data \ 05-23-2018"

* The loop macro is gone once the loop has ended, so the file name is written
* out literally; referencing the macro here produced a note starting with _clean
notes: census_clean ///
	   \ census_acs_clean.do \ BA TS 

* reset is required because the signature stored in the ACS file is still attached
datasignature set, reset
save dta/census_clean, replace

*******************************************************************************
* (12) End of file
*******************************************************************************

* Close the log opened at the top of this file
log close
* Stop here; clear is given so that exit does not refuse because of the data
* still in memory (the combined file was saved just above)
exit, clear
